In [36]:
import requests
from bs4 import BeautifulSoup
import polars as pl
import numpy as np
import plotly.express as px
import plotly.io as pio

# render plotly figures inside the VS Code notebook front-end
pio.renderers.default ='vscode'

# show every column when printing polars DataFrames (-1 = no limit)
pl.Config(tbl_cols=-1)
Out[36]:
<polars.config.Config at 0x1173501d0>
In [2]:
# every page of the search
# CB1 within 3 miles
# up to GBP700
urls = []
for page_offset in range(0, 1200, 24):  # 24 results per page -> 50 pages
    urls.append(
        "https://www.rightmove.co.uk/property-for-sale/find.html"
        "?useLocationIdentifier=true&locationIdentifier=OUTCODE%5E409"
        "&radius=3.0&_includeSSTC=on"
        f"&index={page_offset}"
        "&sortType=2&channel=BUY&transactionType=BUY"
        "&displayLocationIdentifier=CB1.html&maxPrice=700000#prop165096422"
    )
In [3]:
# collect the relative URL of every listing card across all result pages
relative_urls = []

# browser-like User-Agent, consistent with the detail-page requests further
# down (the default python-requests UA may be rejected by the site)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/127.0.0.0 Safari/537.36"
}

for url in urls:
    try:
        # timeout so one hung request cannot stall the whole crawl
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        # skip a failed page rather than aborting the whole crawl
        print(f"Error fetching {url}: {e}")
        continue

    soup = BeautifulSoup(response.content, 'html.parser')

    # get the wrapper for all the properties in each page
    all_containers = soup.find_all(class_='PropertyCard_propertyCardDescriptionInner__3Vkmk')

    # loop through each property card and keep its href (relative URL) to scrape later
    for container in all_containers:
        link = container.find(href=True)
        if link is not None:  # guard: a card without a link would otherwise crash
            relative_urls.append(link.get('href'))
In [4]:
# check the scraped
# sanity check on the number of hrefs collected (1050 at last run)
len(relative_urls)
Out[4]:
1050
In [5]:
# check for the base url
# https://www.rightmove.co.uk/properties/164828633#/?channel=RES_BUY

# e.g. the base in this case is - https://www.rightmove.co.uk
# e.g. the relative is /properties/166143110#/?channel=RES_BUY

# prepend the site root to every relative listing path
base_url = 'https://www.rightmove.co.uk'
full_urls = [base_url + relative for relative in relative_urls]
In [6]:
# click one of the full urls to check for working link
# spot-check: the first few absolute URLs should open in a browser
full_urls[0:5]
Out[6]:
['https://www.rightmove.co.uk/properties/87242181#/?channel=RES_BUY',
 'https://www.rightmove.co.uk/properties/165096422#/?channel=RES_BUY',
 'https://www.rightmove.co.uk/properties/161890835#/?channel=RES_BUY',
 'https://www.rightmove.co.uk/properties/166349258#/?channel=RES_BUY',
 'https://www.rightmove.co.uk/properties/166452245#/?channel=RES_BUY']

Class Testing¶

  • ensure it is finding the correct information and values
In [7]:
# browser-like request headers: present a desktop Chrome User-Agent instead
# of the default python-requests one, which sites commonly block
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/127.0.0.0 Safari/537.36"
}


# fetch one known listing page to verify the CSS-class selectors below
test_request = requests.get(
    'https://www.rightmove.co.uk/properties/161836745#/?channel=RES_BUY',
    # full_urls[5],
    headers=headers
    )
In [8]:
# HTTP status of the test fetch (410 Gone here — the listing was removed,
# yet the selector checks below still parse values from the response body)
test_request.status_code
Out[8]:
410
In [9]:
# parse the test response once; reused by all the selector checks below
test_soup = BeautifulSoup(test_request.content, 'html.parser')
In [10]:
# "more information" panel: one labelled value per section
info_sections = test_soup.find_all(class_='_9u6R9n55iQlZi-JF6H59W')

more_info_array = []
for section in info_sections:
    more_info_array.append(section.find(class_='_2zXKe70Gdypr_v9MUDoVCm').text)

more_info_template = ['council_tax', 'parking', 'garden', 'accessibility']

# pair the expected labels with the extracted values
dict(zip(more_info_template, more_info_array))
Out[10]:
{'council_tax': 'Band: D',
 'parking': 'Off street',
 'garden': 'Yes',
 'accessibility': 'Ask agent'}
In [11]:
# price
# the listing price sits in a <span> inside the price wrapper element
test_soup.find(class_='_1gfnqJ3Vtd1z40MlC0MzXu').find('span').text
Out[11]:
'£700,000'
In [12]:
# street address
# address heading text; may or may not include a postcode (here: street only)
test_soup.find(class_='_2uQQ3SV0eMHL1P6t5ZDo2q').text
Out[12]:
'Kelvin Close'
In [13]:
# key-facts strip: property type / bedrooms / bathrooms / size / tenure
fact_cells = test_soup.find_all(class_='_3gIoc-NFXILAOZEaEjJi1n')

property_info = []
for cell in fact_cells:
    property_info.append(cell.find(class_='_1hV1kqpVceE9m-QrX_hWDN').text)

property_template = ['property_type', 'bedrooms', 'bathrooms', 'size', 'tenure']

# pair the expected labels with the extracted values
dict(zip(property_template, property_info))
Out[13]:
{'property_type': 'Semi-Detached',
 'bedrooms': '3',
 'bathrooms': '2',
 'size': 'Ask agent',
 'tenure': 'Freehold'}
In [14]:
# added date
# e.g. "Added on 12/05/2025"; reduced listings show "Reduced on ..." instead
test_soup.find(class_='_2nk2x6QhNB1UrxdI5KpvaF').text
Out[14]:
'Added on 12/05/2025'

Scraping¶

In [15]:
# create the full list of absolute listing URLs
full_urls = ['https://www.rightmove.co.uk' + relative for relative in relative_urls]

# browser-like User-Agent — hoisted out of the loop since it never changes
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/127.0.0.0 Safari/537.36"
}


# helpers defined once instead of being redefined on every loop iteration
def safe_extract(soup, class_name, default="Not found"):
    """Return stripped text of the first element with `class_name`, or `default`."""
    element = soup.find(class_=class_name)
    return element.text.strip() if element else default


def safe_extract_nested(soup, class_name, nested_tag, default="Not found"):
    """Return stripped text of `nested_tag` inside the first `class_name` element."""
    element = soup.find(class_=class_name)
    if element:
        nested = element.find(nested_tag)
        return nested.text.strip() if nested else default
    return default


def extract_labelled_values(soup, container_class, value_class, keys):
    """Extract one value per container and zip against `keys`.

    Pads with "Not found" when a page exposes fewer containers than keys,
    so every row ends up with the same columns (padding approach suggested
    by Claude AI in the original version).
    """
    values = []
    for cont in soup.find_all(class_=container_class):
        value_elem = cont.find(class_=value_class)
        values.append(value_elem.text.strip() if value_elem else "Not found")
    while len(values) < len(keys):
        values.append("Not found")
    return dict(zip(keys, values))


rows = []
for i, url in enumerate(full_urls):
    # print(f"Processing URL {i+1}/{len(full_urls)}: {url}")
    try:
        # timeout so one hung request cannot stall the whole crawl
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()  # raises HTTPError for 4xx/5xx responses

        soup = BeautifulSoup(response.content, 'html.parser')

        # basic listing information
        row = {
            'url': url,
            'address': safe_extract(soup, '_2uQQ3SV0eMHL1P6t5ZDo2q'),
            'added_date': safe_extract(soup, '_2nk2x6QhNB1UrxdI5KpvaF'),
            'price': safe_extract_nested(soup, '_1gfnqJ3Vtd1z40MlC0MzXu', 'span'),
        }

        # property type, bedrooms, bathrooms, size, tenure
        row.update(extract_labelled_values(
            soup, '_3gIoc-NFXILAOZEaEjJi1n', '_1hV1kqpVceE9m-QrX_hWDN',
            ['property_type', 'bedrooms', 'bathrooms', 'size', 'tenure'],
        ))

        # council tax, parking, garden, accessibility
        row.update(extract_labelled_values(
            soup, '_9u6R9n55iQlZi-JF6H59W', '_2zXKe70Gdypr_v9MUDoVCm',
            ['council_tax', 'parking', 'garden', 'accessibility'],
        ))

        rows.append(row)
        # print(f"Successfully processed: {row['address']}")

    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        continue
    except Exception as e:
        print(f"Error processing {url}: {e}")
        continue

# create DataFrame
df = pl.DataFrame(rows)
In [16]:
# inspect the dataframe
df.head()
Out[16]:
shape: (5, 13)
urladdressadded_datepriceproperty_typebedroomsbathroomssizetenurecouncil_taxparkinggardenaccessibility
strstrstrstrstrstrstrstrstrstrstrstrstr
"https://www.rightmove.co.uk/pr…"Beech Close, Little Shelford, …"Reduced on 29/08/2025""£600,000""Detached Bungalow""3""1""900 sq ft""Freehold""Band: D""Garage,Driveway""Yes""Ask agent"
"https://www.rightmove.co.uk/pr…"Manor Park, Histon""Added on 29/07/2025""£700,000""Detached""3""1""1,313 sq ft""Freehold""Band: TBC""Driveway,Off street""Private garden""Ask agent"
"https://www.rightmove.co.uk/pr…"Turvill Place, Cambridge, CB4""Reduced on 02/09/2025""£700,000""Semi-Detached""4""2""1,521 sq ft""Freehold""Band: E""Yes""Yes""Ask agent"
"https://www.rightmove.co.uk/pr…"Cambridge, Cambridgeshire""Added on 29/08/2025""£700,000""Detached""4""2""1,722 sq ft""Freehold""Band: F""Garage,Allocated""Yes""Ask agent"
"https://www.rightmove.co.uk/pr…"Sedgwick Street, Cambridge""Added on 01/09/2025""£700,000""Terraced""4""2""Ask agent""Freehold""Band: C""On street""Yes""Ask agent"

Data Cleaning¶

In [17]:
# clean the raw scraped strings into typed columns
clean_df = (
    df
    .with_columns(
        # "£700,000" -> 700000.0
        pl.col('price').str.replace_all(r'£|,', '').cast(pl.Float64),
        # a few listings leaked size text into the bedrooms slot; blank those
        # specific strings, then cast (strict=False -> non-numeric becomes null)
        pl.col('bedrooms').str.replace_all(r"1,835 sq ft|2,475 sq ft|Ask agent", '').cast(pl.Categorical, strict=False),
        pl.col('bathrooms').cast(pl.Categorical, strict=False),
        # "1,313 sq ft" -> 1313.0; "Ask agent" -> null
        pl.col('size').str.replace_all(r' sq ft|,', '').cast(pl.Float64, strict=False),
        # "Band: D" -> "D"
        pl.col('council_tax').str.replace_all(r'Band: ', ''),
        # pl.col('address').str.extract(r'([A-Za-z]{2}[0-9]+?)').alias('zip_code'),
        # captures the outcode (e.g. "CB4") when the address includes one;
        # addresses without it yield null
        pl.col('address').str.extract(r'([A-Za-z]{2}[0-9]{1,2})').alias('zip_code'), # postcode followed by the first 2 digits
        # pl.col('address').str.extract(r'([A-Za-z]{2}[0-9]{1,2}.{4})').alias('zip_code'),
        # NOTE(review): only the "Added on " prefix is stripped, so
        # "Reduced on ..." rows fail to parse and become null dates (strict=False)
        pl.col('added_date').str.replace('Added on ', '').str.to_datetime(format='%d/%m/%Y', strict=False).dt.date(),
    )
    .with_columns(
        # space per bedroom in sqft; null if size or bedrooms is null
        (pl.col('size') / pl.col('bedrooms').cast(pl.Int64, strict=False)).alias('sqft_per_bedroom'),
        # rooms per sqft: higher value = more cramped layout
        ((pl.col('bedrooms').cast(pl.Int64, strict=False) + pl.col('bathrooms').cast(pl.Int64, strict=False)) / pl.col('size')).alias('bed_bath_density')
    )
    .rename({'zip_code': 'postcode'})
)

clean_df.head()
Out[17]:
shape: (5, 16)
urladdressadded_datepriceproperty_typebedroomsbathroomssizetenurecouncil_taxparkinggardenaccessibilitypostcodesqft_per_bedroombed_bath_density
strstrdatef64strcatcatf64strstrstrstrstrstrf64f64
"https://www.rightmove.co.uk/pr…"Beech Close, Little Shelford, …null600000.0"Detached Bungalow""3""1"900.0"Freehold""D""Garage,Driveway""Yes""Ask agent"null300.00.004444
"https://www.rightmove.co.uk/pr…"Manor Park, Histon"2025-07-29700000.0"Detached""3""1"1313.0"Freehold""TBC""Driveway,Off street""Private garden""Ask agent"null437.6666670.003046
"https://www.rightmove.co.uk/pr…"Turvill Place, Cambridge, CB4"null700000.0"Semi-Detached""4""2"1521.0"Freehold""E""Yes""Yes""Ask agent""CB4"380.250.003945
"https://www.rightmove.co.uk/pr…"Cambridge, Cambridgeshire"2025-08-29700000.0"Detached""4""2"1722.0"Freehold""F""Garage,Allocated""Yes""Ask agent"null430.50.003484
"https://www.rightmove.co.uk/pr…"Sedgwick Street, Cambridge"2025-09-01700000.0"Terraced""4""2"null"Freehold""C""On street""Yes""Ask agent"nullnullnull
In [18]:
# check all the garden
# show every row of the value counts (-1 = no row limit)
pl.Config(tbl_rows=-1)
df['garden'].value_counts()
Out[18]:
shape: (20, 2)
gardencount
stru32
"Private garden,Back garden"2
"Patio,Private garden,Enclosed …12
"Yes"577
"Patio"11
"Communal garden,Terrace"2
"Front garden,Rear garden"4
"Front garden"1
"Front garden,Back garden"8
"Communal garden"23
"Private garden,Patio,Enclosed …8
"Rear garden"8
"Private garden"75
"Private garden,Patio"2
"Patio,Private garden"1
"Ask developer"31
"Terrace"3
"On street"1
"Back garden"8
"Ask agent"269
"Not found"4
In [19]:
# this is to export the data
# !pip install xlsxwriter
# clean_df.write_excel('right_move_CB1_3miles_700kGBP.xlsx')
In [20]:
# for google colab renderer
# !pip install -U kaleido
In [32]:
# explore the relationship between size (sqft) and price, coloured by bedrooms
# annotate properties whose garden description mentions "private" with 'pg'
fig = px.scatter(
    data_frame=clean_df,
    x='size',
    y='price',
    color='bedrooms',
    labels={'size' : 'sqft'},
    # trendline='ols',
    color_discrete_sequence=px.colors.qualitative.Dark24,
    # 'pg' text label marks listings with a private garden
    text=np.where(clean_df['garden'].str.to_lowercase().str.contains('private').to_numpy(), 'pg', '')

)

fig.update_layout(template='ggplot2', width=900)
fig.update_traces(
    marker=dict(size=9, line=dict(width=0.5, color='grey'), opacity=0.7),
    textposition='top center',
    textfont=dict(size=10, color='black')
)

fig.show(renderer='notebook')
In [35]:
# majority of the 2 and 1 bedrooms do not have garden - check if they are apartments
fig = px.scatter(
    data_frame=clean_df,
    x='size',
    y='price',
    # colour apartments separately from everything else
    color=np.where(clean_df['property_type'].str.to_lowercase().str.contains('apartment').to_numpy(), 'apartment', 'rest'),
    labels={'size' : 'sqft'},
    # trendline='ols',
    color_discrete_sequence=px.colors.qualitative.Dark24,
    # 'pg' text label marks listings with a private garden
    text=np.where(clean_df['garden'].str.to_lowercase().str.contains('private').to_numpy(), 'pg', '')

)

fig.update_layout(template='ggplot2', width=900)
fig.update_traces(
    marker=dict(size=9, line=dict(width=0.5, color='grey'), opacity=0.7),
    textposition='top center',
    textfont=dict(size=10, color='black')
)

# static PNG render (works in environments without an interactive renderer)
fig.show(renderer='png')
No description has been provided for this image
In [23]:
# feature engineer one new feature
# NOTE(review): clean_df already carries an identical 'sqft_per_bedroom'
# column (singular) from the cleaning step — this recomputes it under a
# plural alias, so the frame ends up with duplicate information
clean_df_2 = (
    clean_df
    .with_columns(
        (pl.col('size') / pl.col('bedrooms').cast(pl.Int64, strict=False)).alias('sqft_per_bedrooms')
    )
)

Interpretation:

  • Average room space allocation:
    • higher values = more spacious properties relative to bedroom count
    • lower values = more compact/efficient use of space
  • Property layout insights:
    • for example, a 2000 sqft house with 2 bedrooms = 1000 sqft/bedroom (more spacious)
  • Value comparison:
    • luxury properties will have higher sqft/bedroom ratios
    • starter homes and apartments will have lower ratios

How does this metric work?

  • filter out properties with the least sqft per bedroom
  • compare properties with different bedroom counts
  • analyze pricing patterns based on space efficiency or outliers (cramped vs spacious)
In [ ]:
 
In [47]:
# interested in 2-3 bedroom properties
# want a relatively spacious feel (doesn't look cramped from the inside)
# i.e. large total size (sqft), spacious (higher sqft_per_bedroom), more bedrooms (3)
# 'RdBu' continuous scale: red/pink for small sizes through blue for large
fig = px.scatter(
    data_frame=(
        clean_df_2
        .with_columns(
            # bedrooms is Categorical; make it numeric for the range filter
            pl.col('bedrooms').cast(pl.Int64, strict=False)
        )
        .filter(
            pl.col('bedrooms').is_between(2, 3)
        )
    ),
    x='sqft_per_bedrooms',
    y='price',
    color='size',
    # trendline='ols',
    color_continuous_scale='RdBu',
    # label each point with its bedroom count
    text='bedrooms'

)

fig.update_layout(template='ggplot2', width=900)
fig.update_traces(
    marker=dict(size=9, line=dict(width=0.5, color='grey'), opacity=0.9),
    textposition='top center',
    textfont=dict(size=10, color='black')
)

fig.show(renderer='jpg')
No description has been provided for this image
In [49]:
# seems like the ideal more balanced properties are:
# >1000 sqft
# >500 sqft_per_bedrooms
# 2-3 bedrooms
# between 350k to 550k
# NOTE(review): the filter below does NOT apply the >500 sqft_per_bedrooms
# criterion listed above — confirm whether it was dropped intentionally
shortlisted_properties = (
    clean_df_2
    .with_columns(
        # bedrooms is Categorical; make it numeric for the range filter
        pl.col('bedrooms').cast(pl.Int64, strict=False)
    )
    .filter(
        pl.col('size').gt(1000) &
        pl.col('bedrooms').is_between(2,3) &
        pl.col('price').is_between(350000, 550000)
    )

)
In [70]:
# horizontal bar chart of asking prices for the shortlisted properties
fig = px.bar(
    shortlisted_properties,
    y='address',
    x='price',
)

fig.update_layout(
    template='ggplot2',
    width=1000,
)

# NOTE(review): rendered as a static PNG, so axis labels are NOT clickable
# as the original comments claimed — switch to an interactive renderer
# (e.g. 'notebook') if link-like behaviour on addresses is wanted
fig.show(renderer='png')
No description has been provided for this image
In [73]:
# shortlisted_properties.write_excel('shortlisted_properties.xlsx')
Out[73]:
<xlsxwriter.workbook.Workbook at 0x126ddc2c0>
In [48]:
# out of curiosity, check the property price by postcode
# split listings with exactly 2 or 3 bedrooms from the rest
fig = px.box(
    data_frame=clean_df_2,
    x='postcode',
    y='price',
    points='all',
    # two facets: 2-3 bedroom listings vs everything else
    facet_col=np.where(
        clean_df_2['bedrooms'].cast(pl.Int64, strict=False).is_between(2,3).to_numpy(),
        '2 or 3 bedrooms', 'rest'
        ),
    color='bedrooms',
    title='Property Prices by Postcode: 2-3 Bedrooms vs Others'
)

fig.update_layout(
    template='ggplot2',
    width=1200,
    yaxis_tickformat='£,.0f',
    xaxis_title='Postcode',
    yaxis_title='Price (£)'
)

# Rotate postcode labels if they're crowded
fig.update_xaxes(tickangle=45)
fig.show(renderer='jpg')
No description has been provided for this image